In [1]:
import matplotlib.pyplot as plt
from matplotlib import rc
rc('font',**{'family':'monospace','monospace':['Computer Modern Typewriter']})
rc('text', usetex=True)
import matplotlib
matplotlib.rcParams["text.latex.preamble"].append(r'\usepackage{xfrac}')
%matplotlib inline
import matplotlib.mlab as mlab
from matplotlib.dates import MonthLocator, WeekdayLocator, DateFormatter
import numpy as np
import pandas
In [2]:
# Load the five usage tables that carry a header row, in the same order as the
# names on the left-hand side.
(pediaDomains,
 dataDomains,
 topPediaDomains,
 topDataDomains,
 topLevelDomainsScatter) = (
    pandas.read_csv(csvPath)
    for csvPath in (
        './Data/wp_used_domains.csv',
        './Data/wd_used_domains.csv',
        './Data/wp_used_tld.csv',
        './Data/wd_used_tld.csv',
        './Data/clean_tld.csv',
    )
)
# This file is read without a header row, so its columns are addressed as 0, 1, ...
domains = pandas.read_csv('./Data/matching_domain_count.csv', header=None)
In [3]:
# Horizontal bar chart of the first `length` rows of `domains`, each expressed
# as a percentage of the total over column 1.
totalDomainsOccurrences = sum(domains[1])

length = 10
width = 0.8

fig = plt.figure()
rows = range(length)
shares = np.asarray(domains[1][:length] * 100 / totalDomainsOccurrences)
plt.barh(rows, shares, width, align='center', color='b')
plt.grid(which='both')
plt.xlabel(r'$\%$ on the Total Number of Matches')
# Only the lower y-limit is set here; the upper limit is left as it is.
plt.ylim(-width)
plt.yticks(rows, domains[0][:length])
plt.tight_layout()
plt.savefig("./Figures/TopDomains.pdf", format="pdf")
plt.show()
In [4]:
# Bar chart of the first `length` rows of `dataDomains`: each 'refHost' value
# as a percentage of the column total, labelled with the 'index' column.
totalDataOccurrences = sum(dataDomains['refHost'])

length = 10
width = 0.8

rows = range(length)
shares = np.asarray(dataDomains['refHost'][:length] * 100 / totalDataOccurrences)
ax = plt.barh(
    rows, shares, width,
    align='center', color='#b60628', edgecolor='white', hatch="//",
)
plt.grid(which='both')
plt.xlabel(r'$\%$ on the Total Number of References')
# Only the lower y-limit is set here.
plt.ylim(-width)
plt.yticks(rows, dataDomains['index'][:length])
plt.tight_layout()
plt.savefig("./Figures/TopDomainsData.pdf", format="pdf")
plt.show()
In [5]:
# Bar chart of the first `length` rows of `pediaDomains`: each 'refHost' value
# as a percentage of the column total, labelled with the 'index' column.
totalPediaOccurrences = sum(pediaDomains['refHost'])

length = 10
width = 0.8

rows = np.arange(length)
shares = np.asarray(pediaDomains['refHost'][:length] * 100 / totalPediaOccurrences)
ax = plt.barh(
    rows, shares, width,
    align='center', color='#06b694', edgecolor='white', hatch='x',
)
plt.grid(which='both')
plt.xlabel(r'$\%$ on the Total Number of Citations')
# Only the lower y-limit is set here.
plt.ylim(-width)
plt.yticks(rows, pediaDomains['index'][:length])
plt.tight_layout()
plt.savefig("./Figures/TopDomainsPedia.pdf", format="pdf")
plt.show()
In [6]:
# Bar chart of the first `length` rows of `topPediaDomains`: each 'citeTld'
# value as a percentage of the column total.
# Note: this rebinds totalPediaOccurrences to the 'citeTld' total.
totalPediaOccurrences = sum(topPediaDomains['citeTld'])

length = 10
width = 0.8

rows = range(length)
shares = np.asarray(topPediaDomains['citeTld'][:length] * 100 / totalPediaOccurrences)
ax = plt.barh(rows, shares, width, align='center', color='#06b694')
plt.grid(which='both')
plt.xlabel(r'$\%$ on the Total Number of Matches')
# Only the lower y-limit is set here.
plt.ylim(-width)
plt.yticks(rows, topPediaDomains['index'][:length])
plt.tight_layout()
plt.savefig("./Figures/TopLevelDomainsPedia.pdf", format="pdf")
plt.show()
In [7]:
# Bar chart of the first `length` rows of `topDataDomains`: each 'refTld'
# value as a percentage of the column total.
# Note: this rebinds totalDataOccurrences to the 'refTld' total.
totalDataOccurrences = sum(topDataDomains['refTld'])

length = 10
width = 0.8

shares = np.asarray(topDataDomains['refTld'][:length] * 100 / totalDataOccurrences)
ax1 = plt.barh(range(length), shares, width, align='center', color='#b60628')
plt.grid(which='both')
plt.xlabel(r'$\%$ on the Total Number of Matches')
# Only the lower y-limit is set here.
plt.ylim(-width)
plt.yticks(np.arange(length), topDataDomains['index'][:length])
plt.tight_layout()
plt.savefig("./Figures/TopLevelDomainsData.pdf", format="pdf")
plt.show()
In [8]:
# Side-by-side comparison, for the first `length` rows of `topDataDomains`, of
# the 'refTld' share (Wikidata bars) and the 'citeTld' share of the same TLD in
# `topPediaDomains` (Wikipedia bars).

# Defined before first use so the cell does not depend on a value of `length`
# left behind by an earlier cell.
length = 10
width = 0.4

# Recompute both denominators from their source columns. Earlier cells bind
# totalPediaOccurrences to two different totals ('refHost' and 'citeTld'), so
# relying on the leftover value would make this figure depend on which of
# those cells ran last.
totalDataOccurrences = topDataDomains['refTld'].sum()
totalPediaOccurrences = topPediaDomains['citeTld'].sum()

# TLD -> 'citeTld' value. If a TLD appears more than once the last row wins,
# which matches what a full scan with overwrite would produce.
pediaCountByTld = dict(zip(topPediaDomains['index'], topPediaDomains['citeTld']))

# TLDs absent from topPediaDomains keep a value of 0.
tempPediaDomains = np.zeros(length)
for position, tld in enumerate(topDataDomains['index'][:length]):
    tempPediaDomains[position] = pediaCountByTld.get(tld, 0)

ax1 = plt.barh(np.arange(length), np.asarray(topDataDomains['refTld'][0:length] * 100 / totalDataOccurrences), width, label = 'Wikidata', color='#b60628')
ax2 = plt.barh(np.arange(length) + width, tempPediaDomains * 100 / totalPediaOccurrences, width, label = 'Wikipedia', color='#06b694')
plt.legend()
plt.grid(which='both')
# Same axis label as the two single-source TLD charts this figure combines.
plt.xlabel(r'$\%$ on the Total Number of Matches')
# Only the lower y-limit is set here.
plt.ylim(-width)
plt.yticks(np.arange(length) + width, topDataDomains['index'][0:length])
plt.tight_layout()
plt.savefig("./Figures/TopLevelDomainsComparison.pdf", format="pdf")
plt.show()
In [9]:
# Scatter plot of 'citeTld' against 'refTld' for every row of
# topLevelDomainsScatter, drawn on log-log axes.
plt.plot(topLevelDomainsScatter['citeTld'], topLevelDomainsScatter['refTld'], "o", color='b')

# Label the n rows with the largest 'refTld' values.
selected = topLevelDomainsScatter.sort_values('refTld', ascending=False)
n = 10
labelled = selected[:n]
for tld, cites, refs in zip(labelled['index'], labelled['citeTld'], labelled['refTld']):
    # Label offsets are multiplicative because both axes are logarithmic.
    # 'gov' and 'pl' are placed to the left of their point (x * 0.5) instead of
    # to the right (x * 1.1) to avoid clutter from overlapping labels.
    x_factor = 0.5 if tld in ('gov', 'pl') else 1.1
    plt.annotate(tld, (cites * x_factor, refs * 1.1))

plt.grid(which='both', linewidth=0.2)
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Wikipedia Citations')
plt.ylabel('Wikidata References')
plt.savefig("./Figures/ScatterPlot.pdf", format="pdf")
plt.show()
In [10]:
# Load the three item-type tables and align the 'shareType' values of
# domainTypes and itemTypes to the row order of dataTypes.

# dataTypes is ordered by descending 'shareType' with a fresh 0..n-1 index.
# Built as one chained expression so re-running the cell is idempotent.
dataTypes = (
    pandas.read_csv('./Data/item_type_all_wd.csv')
    .sort_values(by='shareType', ascending=False)
    .reset_index(drop=True)
)
domainTypes = pandas.read_csv('./Data/item_types_matchdom.csv')
itemTypes = pandas.read_csv('./Data/item_types.csv')

# type -> shareType lookups replace a full scan of each table per row.
# If a type is repeated, the last row wins, as it would with scan-and-overwrite.
domainShareByType = dict(zip(domainTypes['type'], domainTypes['shareType']))
itemShareByType = dict(zip(itemTypes['type'], itemTypes['shareType']))

# One entry per dataTypes row; types missing from the other table stay at 0.
tempDomainTypes = np.array(
    [domainShareByType.get(itemType, 0.0) for itemType in dataTypes['type']],
    dtype=float,
)
tempItemTypes = np.array(
    [itemShareByType.get(itemType, 0.0) for itemType in dataTypes['type']],
    dtype=float,
)
In [11]:
# Grouped horizontal bar chart with three bars per item type: the dataTypes
# share, the itemTypes share and the domainTypes share. The first row of
# dataTypes (the one with the largest 'shareType') is left out of the figure.
width = 0.28
plt.figure(figsize=(10, 6))

# One group position per plotted type (every row except the first).
positions = np.arange(0, len(dataTypes) - 1)

ax1 = plt.barh(positions - width, np.asarray(dataTypes['shareType'][1:len(dataTypes)]), width, label = 'Wikidata', color='k', edgecolor='white', hatch='//')
ax2 = plt.barh(positions, tempItemTypes[1:len(dataTypes)], width, label = 'Items In our Dataset', color='#06b694', edgecolor='white', hatch='|')
ax3 = plt.barh(positions + width, tempDomainTypes[1:len(dataTypes)], width, label = 'Items with Matching Domains', color='r', edgecolor='white', hatch='x')
plt.legend()
plt.grid(which='both')
plt.xlabel(r'$\%$ on the Total Number of Matches')
# Only the lower y-limit is set here.
plt.ylim(-2 * width)
# Tick positions must match the labels one-to-one: both cover rows 1..n-1.
# Passing one more position than labels raises ValueError in current Matplotlib.
plt.yticks(positions + 0.45 * width, dataTypes['type'][1:len(dataTypes)])
plt.tight_layout()
plt.savefig("./Figures/MatchedTypes.pdf", format="pdf")
plt.show()
In [12]:
# Both files are read without a header row, so columns are addressed as 0 and 1.
# The "single ref" table is ordered by descending column 1 and re-indexed 0..n-1.
referencesLanguagesSingle = (
    pandas.read_csv('./Data/matching_refs_unique_lang_single_ref_count.csv', header=None)
    .sort_values(by=1, ascending=False)
    .reset_index(drop=True)
)
referencesLanguages = pandas.read_csv('./Data/matching_refs_unique_lang_count.csv', header=None)
In [13]:
# Paired bar chart for the first `length` rows of referencesLanguagesSingle:
# the row's share of column 1, next to the share of the row with the same
# column-0 key in referencesLanguages.
totalReferencesSingleOccurrences = sum(referencesLanguagesSingle[1])
totalReferencesOccurrences = sum(referencesLanguages[1])

length = 10

# Column-1 value from referencesLanguages for each plotted key; keys with no
# match stay at 0, and a later matching row overwrites an earlier one.
tempReferencesLanguages = np.zeros(length)
for slot, wantedKey in enumerate(referencesLanguagesSingle[0][:length]):
    for candidateKey, candidateCount in zip(referencesLanguages[0], referencesLanguages[1]):
        if candidateKey == wantedKey:
            tempReferencesLanguages[slot] = candidateCount

width = 0.4
rows = np.arange(length)
singleShares = np.asarray(referencesLanguagesSingle[1][:length] * 100 / totalReferencesSingleOccurrences)
ax1 = plt.barh(
    rows, singleShares, width,
    label = 'Total Page Matches', color='#b60628', edgecolor='white', hatch='//',
)
ax2 = plt.barh(
    rows + width, tempReferencesLanguages * 100 / totalReferencesOccurrences, width,
    label = 'Unique Page Matches', color='#06b694', edgecolor='white', hatch='x',
)
plt.legend()
plt.grid(which='both')
plt.xlabel(r'$\%$ on the Total Number of Matches')
# Only the lower y-limit is set here.
plt.ylim(-width)
plt.yticks(rows + width, referencesLanguagesSingle[0][:length])
plt.tight_layout()
plt.savefig("./Figures/ReferencesLanguages.pdf", format="pdf")
plt.show()
In [14]:
# Both files are read without a header row, so columns are addressed as 0 and 1.
# The "single ref" table is ordered by descending column 1 and re-indexed 0..n-1.
domainsLanguagesSingle = (
    pandas.read_csv('./Data/matching_domain_lang_single_ref_count.csv', header=None)
    .sort_values(by=1, ascending=False)
    .reset_index(drop=True)
)
domainsLanguages = pandas.read_csv('./Data/matching_domain_unique_lang_count.csv', header=None)
In [15]:
# Paired bar chart for the first `length` rows of domainsLanguagesSingle:
# the row's share of column 1, next to the share of the row with the same
# column-0 key in domainsLanguages.
# Note: totalDomainsOccurrences is rebound here to the domainsLanguages total.
totalDomainsSingleOccurrences = sum(domainsLanguagesSingle[1])
totalDomainsOccurrences = sum(domainsLanguages[1])

length = 10

# Column-1 value from domainsLanguages for each plotted key; keys with no
# match stay at 0, and a later matching row overwrites an earlier one.
tempDomainsLanguages = np.zeros(length)
for slot, wantedKey in enumerate(domainsLanguagesSingle[0][:length]):
    for candidateKey, candidateCount in zip(domainsLanguages[0], domainsLanguages[1]):
        if candidateKey == wantedKey:
            tempDomainsLanguages[slot] = candidateCount

width = 0.4
rows = np.arange(length)
singleShares = np.asarray(domainsLanguagesSingle[1][:length] * 100 / totalDomainsSingleOccurrences)
ax1 = plt.barh(
    rows, singleShares, width,
    label = 'Total Domain Matches', color='#b60628', edgecolor='white', hatch='//',
)
ax2 = plt.barh(
    rows + width, tempDomainsLanguages * 100 / totalDomainsOccurrences, width,
    label = 'Unique Domain Matches', color='#06b694', edgecolor='white', hatch='x',
)
plt.legend()
plt.grid(which='both')
plt.xlabel(r'$\%$ on the Total Number of Matches')
# Only the lower y-limit is set here.
plt.ylim(-width)
plt.yticks(rows + width, domainsLanguagesSingle[0][:length])
plt.tight_layout()
plt.savefig("./Figures/DomainsLanguages.pdf", format="pdf")
plt.show()